#!/usr/bin/env python
# -*- coding:utf-8 -*-
import copy
import hashlib
import json
import re
import requests
import pymongo
import ConfigParser

import sys
import threading
import urllib
from bs4 import BeautifulSoup
import datetime
import thread
from scpy.logger import get_logger
from scpy import xawesome_mail
from scpy.xawesome_time import parse_time
from scpy.xawesome_codechecker import get_runner

logger = get_logger(__file__)

# Python 2 idiom: force the process-wide default encoding to UTF-8 so the
# many Chinese literals below can mix with byte strings without
# UnicodeDecodeError.  (reload() is needed because setdefaultencoding is
# removed from the sys module after startup.)
reload(sys)
sys.setdefaultencoding("utf-8")

# NOTE(review): imported after the encoding tweak — presumably deliberate
# (utils may contain non-ASCII literals); confirm before reordering.
import utils

def format_company_name(company_name):
    """Normalize a company name by replacing ASCII parentheses with their
    full-width (CJK) equivalents, matching how Qichacha renders names.

    (Was a lambda bound to a name; a def keeps the same call interface and
    is the idiomatic form — PEP 8 E731.)
    """
    return company_name.replace(u'(', u'（').replace(u')', u'）')


class QichachaCrawler(object):
    """Crawler for qichacha.com company pages.

    Searches by keyword, then scrapes a company's detail page and its
    sub-pages (investments, lawsuits, annual reports, trademarks, user
    comments, baike contact info) into one DATA_FORMAT document.
    """

    # Cookie currently in use, shared by all instances; rotated via
    # pop_cookie() once Qichacha flags the account as abnormal.
    NOW_COOKIE = None

    TYPE_COMPANY, TYPE_SHAREHOLDER, TYPE_LAWSUIT = 0, 1, 2  # company / legal person / lawsuit
    MODE_SINGLE, MODE_PAGE, MODE_ALL = 0, 1, 2
    SEARCH_URL = 'http://qichacha.com/search?key=%s&type=%d'
    INDEX_URL = 'http://qichacha.com'

    # Loose date pattern: 4 digits, any separator, then 2+2 digits.
    PATTERN_YY_MM_DD = re.compile(ur'\d{4}.\d{2}.\d{2}')
    PATTERN_SPACE = re.compile(ur'\s')
    # Search-result summary line: regNo / legal person / status / est. date.
    PATTERN_ITEM = re.compile(ur'注册号：(.*?)法人:(.*?)状态：(.*?)成立日期：(.*?)$')
    PATTERN_QICHACHA_HOST = re.compile(ur'http[s]?://tm-image.qichacha.com')

    # Shape of one crawled company document (the persisted record).
    DATA_FORMAT = {
        '_id': '',
        'companyName': '',
        'province': '',
        'basicList': [],
        'shareHolderList': [],
        'personList': [],
        'punishBreakList': [],
        'punishedList': [],
        'alidebtList': [],
        'entinvItemList': [],
        'frinvList': [],
        'frPositionList': [],
        'alterList': [],
        'filiationList': [],
        'caseInfoList': [],
        'sharesFrostList': [],
        'sharesImpawnList': [],
        'morDetailList': [],
        'morguaInfoList': [],
        'shixinList': [],
        'executedList': [],
        'liquidationList': [],
        'yearReportList': [],
        'trademarkList': [],
        'commentList': [],
        'baikeList': [],
    }
    # Registration ("licence face") info — one entry of basicList.
    BASE_DATA_FORMAT = {
        'enterpriseName': '',  # enterprise name
        'frName': '',  # legal representative's name
        'regNo': '',  # business registration number
        'regCap': '',  # registered capital (unit: 10k CNY)
        'regCapCur': '',  # registered-capital currency
        'esDate': '',  # establishment date (YYYY-MM-DD)
        'openFrom': '',  # operating period from (YYYY-MM-DD)
        'openTo': '',  # operating period to (YYYY-MM-DD)
        'enterpriseType': '',  # enterprise (institution) type
        'enterpriseStatus': '',  # operating status (active / cancelled / revoked / other)
        'cancelDate': '',  # cancellation date
        'revokeDate': '',  # revocation date
        'address': '',  # registered address
        'abuItem': '',  # licensed business items
        'cbuItem': '',  # general business items
        'operateScope': '',  # business scope
        'operateScopeAndForm': '',  # business scope and form
        'regOrg': '',  # registration authority
        'ancheYear': '',  # last annual-inspection year
        'ancheDate': '',  # last annual-inspection date
        'industryPhyCode': '',  # industry division code
        'industryPhyName': '',  # industry division name
        'industryCode': '',  # national-economy industry code
        'industryName': '',  # national-economy industry name
        'recCap': '',  # paid-in capital
        'oriRegNo': '',  # original registration number
    }
    # One shareholder entry of shareHolderList.
    INVESTOR_DATA_FORMAT = {
        'shareholderName': '',  # shareholder name
        'shareholderType': '',  # shareholder type
        'subConam': '',  # subscribed contribution (unit: 10k CNY)
        'regCapCur': '',  # currency
        'conDate': '',  # contribution date
        'fundedRatio': '',  # contribution ratio
        'country': '',  # country
    }
    # One annual report entry of yearReportList.
    YEAR_REPORT_FORMAT = {
        'baseInfo': {
            'regNo': '',  # business registration number
            'phone': '',  # company contact phone
            'email': '',  # e-mail address
            'zipcode': '',  # postal code
            'enterpriseStatus': '',  # operating status
            'haveWebsite': '',  # whether it has a website or online shop
            'buyEquity': '',  # whether it invested in / bought equity of other companies
            'equityTransfer': '',  # whether any shareholder equity transfer happened this year
            'address': '',  # address
            'employeeCount': '',  # number of employees
        },
        'website': {
            'type': '',  # type
            'name': '',  # name
            'link': '',  # URL
        },
        'investorInformations': [],  # promoters & contribution info : INVESTOR_YR_FORMAT
        'assetsInfo': {  # company assets status
                         'generalAssets': '',  # total assets
                         'ownersEequity': '',  # total owners' equity
                         'revenue': '',  # total operating revenue
                         'profit': '',  # total profit
                         'mainRevenue': '',  # main-business revenue within total revenue
                         'netProfit': '',  # net profit
                         'taxPayment': '',  # total tax paid
                         'liability': '',  # total liabilities
                         },
        'equityChangeInformations': [],
    # equity change records: {'shareholderName': shareholder, 'equityAfter': ratio before change, 'equityBefore': ratio after change, 'time': change date}
    # NOTE(review): per the original comments equityAfter/equityBefore look
    # swapped relative to their names — confirm against downstream consumers.
        'changeRecords': []
    # modification records: {'changedItem': item, 'beforeChange': before, 'afterChange': after, 'time': date}
    }
    # One row of the annual report's promoters & contribution table.
    INVESTOR_YR_FORMAT = {
        'shareholderName': '',  # promoter
        'subConam': '',  # subscribed contribution (10k CNY)
        'subConDate': '',  # subscribed contribution date
        'subConType': '',  # subscribed contribution method
        'paidConMoney': '',  # paid-in contribution (10k CNY)
        'paidTime': '',  # paid-in date
        'paidType': '',  # paid-in method
    }
    # Contact / misc info parsed from the "baike" tab.
    BAIKE_FORMAT = {
        'emails': [],
        'phones': [],
        'websites': [],
        'products': [],
        'datas': [],
        'notices': [],
    }
    # Page label (Chinese) -> YEAR_REPORT_FORMAT['baseInfo'] key.
    YEAR_REPORT_KV_MAP = {
        u'注册号': 'regNo',
        u'企业经营状态': 'enterpriseStatus',
        u'企业联系电话': 'phone',
        u'电子邮箱': 'email',
        u'邮政编码': 'zipcode',
        u'是否有网站或网店': 'haveWebsite',
        u'有限责任公司本年度是否发生股东股权转让': 'equityTransfer',
        u'企业是否有投资信息或购买其他公司股权': 'buyEquity',
        u'从业人数': 'employeeCount',
        u'住所': 'address',
    }
    # Page label (Chinese) -> assetsInfo key.
    ASSETS_KV_MAP = {
        u'资产总额': 'generalAssets',
        u'所有者权益合计': 'ownersEequity',
        u'营业总收入': 'revenue',
        u'利润总额': 'profit',
        u'营业总收入中主营业务收入': 'mainRevenue',
        u'净利润': 'netProfit',
        u'纳税总额': 'taxPayment',
        u'负债总额': 'liability',
    }
    # One trademark entry of trademarkList.
    MARK_FORMAT = {
        'regNumber': '',  # registration number
        'category': '',  # category
        'categoryId': '',  # category id
        'flowStatus': '',  # trademark status
        'person': '',  # applicant
        'applyDate': '',  # application date
        'validPeriodFrom': '',  # validity period (start)
        'validPeriodTo': '',  # validity period (end)
        'agent': '',  # agency
        'listGroupItems': [],  # goods / services list
        'flowItems': [],  # trademark process steps
        'name': '',  # trademark name
        'imageUrl': '',  # trademark image
        'ID': '',  # trademark id
    }
    # Qichacha JSON field -> MARK_FORMAT key.
    MARK_KV_MAP = {
        'RegNo': 'regNumber',
        'ID': 'ID',
        'Category': 'category',
        'CategoryId': 'categoryId',
        'FlowStatus': 'flowStatus',
        'Person': 'person',
        'ApplyDate': 'applyDate',
        'Agent': 'agent',
        'ListGroupItems': 'listGroupItems',
        'Name': 'name',
        'FlowItems': 'flowItems',
        'ImageUrl': 'imageUrl',
    }
    # Lawsuit-table label (Chinese) -> result key (shixin/executed records).
    SS_KV_MAP = {
        u'案号': 'caseCode',
        u'执行法院': 'courtName',
        u'立案时间': 'regDate',
        u'被执行人名称': 'realName',
        u'发布时间': 'publishDate',
        u'身份证号码/组织机构代码': 'idNumber',
        u'执行标的': 'subject',
        u'省份': 'areaName',
        u'法律生效文书确定的义务': 'duty',
        u'执行依据文号': 'gistId',
        u'法人名称': 'frName',
        u'名称': 'companyName',
        u'被执行人的履行情况': 'performance',
        u'做出执行依据单位': 'gistUnit',
    }

    def __init__(self, key, type, province=''):
        """Set up an HTTP session with a valid Qichacha cookie.

        :param key: search keyword (company / shareholder / case)
        :param type: TYPE_COMPANY | TYPE_SHAREHOLDER | TYPE_LAWSUIT
        :param province: optional province filter for search URLs
        """
        # Promote byte strings to unicode before stripping.
        key = unicode(key) if isinstance(key, str) else key
        self.key = key.strip()
        self.type = type
        self.province = province
        session = requests.Session()
        session.headers['User-Agent'] = (
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36')
        self.request = session
        # get_cookie() may hit MongoDB to fetch a fresh cookie.
        self.request.headers['Cookie'] = self.get_cookie()
        logger.info('init qichacha crawler done.')

    # def deal_key(self, key):
    #     try:
    #         key = unicode(key)
    #     except:
    #         pass
    #     pattern = re.compile(ur'^(.+?公司).+?分公司')
    #     name = pattern.findall(key)
    #     if name:
    #         return name[0]
    #     return key

    @classmethod
    def url_encode(cls, key):
        return urllib.quote(key.encode('utf-8'))

    def build_search_url(self):
        url = QichachaCrawler.SEARCH_URL % (self.url_encode(self.key), self.type)
        if self.province:
            url += '&province=%s' % self.province
        return url

    @classmethod
    def pop_cookie(cls):
        """Invalidate the current cookie in MongoDB and fetch a fresh one.

        :return: the new cookie string, or None when none is available.
        """
        try:
            coll = utils.DB[utils.QICHACHA_COOKIE_COLL]
            current = QichachaCrawler.NOW_COOKIE
            if current:
                # Flag the exhausted cookie so it is never handed out again.
                coll.update_one({'_id': current},
                                {'$set': {'_id': current, 'valuable': False}},
                                True)
            fresh = coll.find_one({"valuable": True})['_id']
            QichachaCrawler.NOW_COOKIE = fresh
            return fresh
        except Exception as e:
            logger.exception(e)
            return None

    def get_cookie(self, next=False):
        """Return a usable cookie; with next=True, rotate to a fresh one.

        :raises Exception: when no valid cookie can be obtained.
        """
        if next:
            cookie = self.pop_cookie()
            logger.info('pop a new cookie [%s]' % cookie)
        else:
            # Reuse the cached cookie when present, otherwise pop one.
            cookie = QichachaCrawler.NOW_COOKIE or self.pop_cookie()
        if not cookie:
            raise Exception('No cookie found')
        return cookie

    def get_url_content(self, url, deep=0):
        """GET *url* and return the body, following Qichacha's JS redirects
        and rotating the cookie when the account is flagged as abnormal.

        :param url: absolute URL to fetch
        :param deep: internal retry/redirect counter; gives up after 3 hops
        :return: response body (byte string) or None after 3 attempts
        """
        if deep == 3:
            return None
        content = self.request.get(url).content
        # Qichacha sometimes answers with a one-line <script> redirect
        # instead of an HTTP 3xx; follow it manually.
        redirect = re.findall(ur"<script>location.href=[\"'](http://www.qichacha.com.+?)[\"'];</script>", content)
        if redirect:
            print redirect[0]
            return self.get_url_content(redirect[0], deep + 1)
        if u'您的账户存在异常' in content:
            # "Your account is abnormal" page: this cookie is burned.
            # Alert by mail, switch to a fresh cookie and retry the URL.
            logger.info(u'Qichacha Crawler cookie Invalid [%s]' % self.key)
            xawesome_mail.send_mail(
                u'企查查-[您的账户存在异常]', u'html源码中出现“您的账户存在异常”，现在的cookie是【%s】' % self.NOW_COOKIE,
                ['xu.du@socialcredits.cn'], sender_nickname='企查查警告')
            self.request.headers['Cookie'] = self.get_cookie(next=True)
            return self.get_url_content(url, deep + 1)
        return content

    def post(self, url, data, headers=None, deep=0):
        """POST to Qichacha, rotating cookies when the account is banned.

        :param url: target URL
        :param data: form payload
        :param headers: optional extra headers for this request only
            (was a mutable default ``{}`` — now None, same truthiness)
        :param deep: internal retry counter; gives up after 3 attempts
        :return: response body (byte string) or None after 3 failures
        """
        if deep == 3:
            return None
        try:
            if headers:
                content = self.request.post(url, data=data, headers=headers).content
            else:
                content = self.request.post(url, data=data).content
            if u'您的账户存在异常' in content:
                # "Your account is abnormal": burn this cookie, alert, retry.
                logger.info(u'Qichacha Crawler cookie Invalid [%s]' % self.key)
                xawesome_mail.send_mail(
                    u'企查查-[您的账户存在异常]', u'html源码中出现“您的账户存在异常”，现在的cookie是【%s】' % self.NOW_COOKIE,
                    ['xu.du@socialcredits.cn'], sender_nickname='企查查警告')
                self.request.headers['Cookie'] = self.get_cookie(next=True)
                return self.post(url, data, headers=headers, deep=deep + 1)
            return content
        except Exception as e:
            # Log the failure instead of swallowing it silently, then retry.
            logger.exception(e)
            return self.post(url, data, headers=headers, deep=deep + 1)

    def parse_target_item(self, li):
        """Parse one search-result <li> into a flat target dict.

        Extracts company name and detail link from the title anchor, then
        pulls regNumber / frName / enterpriseStatus / esDate out of the
        concatenated "site-text" divs via PATTERN_ITEM.
        """
        item = {'company': '', 'regNumber': '', 'frName': '', 'esDate': '', 'enterpriseStatus': '', 'link': ''}
        a = li.find('h3', attrs={'class': 'site-list-title'}).find('a')
        item['company'] = a.getText().strip()
        item['link'] = self.INDEX_URL + a['href']
        text = ''.join([div.getText().replace(' ', '') for div in li.find_all('div', attrs={'class': 'site-text'})])
        try:
            # Collapse remaining whitespace so PATTERN_ITEM matches one line.
            text = self.PATTERN_SPACE.sub('', text)
            values = self.PATTERN_ITEM.findall(text)[0]
            item['regNumber'] = values[0]
            item['frName'] = values[1]
            item['enterpriseStatus'] = values[2]
            item['esDate'] = parse_time(values[3])
        except Exception, e:
            # Layout variations leave the detail fields empty but keep the item.
            logger.exception(e)
        return item

    def parse_target_list(self, soup):
        """Extract every company entry from a search-result page."""
        targets = []
        items = soup.find_all('li', attrs={'class': 'site-list-item company-list-item'})
        for li in items:
            try:
                targets.append(self.parse_target_item(li))
            except Exception as e:
                logger.exception(e)
        return targets

    def find_next_page(self, soup):
        """Return the absolute URL of the pager's "next page" link.

        :return: URL string, or False when there is no next page.
        """
        try:
            pager = soup.find('ul', attrs={'class': 'pager'})
            anchors = [li.find('a') for li in pager.find_all('li')]
            next_links = [a for a in anchors if u'下一页' in a.getText()]
            return self.INDEX_URL + next_links[0]['href']
        except:
            # Any missing element means no pager / no next page.
            return False

    def parse_base_info(self, soup, target):
        """Parse the company-info <ul> into a BASE_DATA_FORMAT dict.

        :param soup: BeautifulSoup of the company detail page
        :param target: search hit supplying the canonical 'companyName'
        :return: dict following BASE_DATA_FORMAT
        """
        # Page label (Chinese) -> result key; regcap/openFromTo are raw
        # intermediate fields split into final fields further down.
        KV_MAP = {
            u'注册号': 'regNo',  # registration number
            u'经营状态': 'enterpriseStatus',  # operating status
            u'公司类型': 'enterpriseType',  # company type
            u'成立日期': 'esDate',  # establishment date
            u'法定代表': 'frName',  # legal representative
            u'注册资本': 'regcap',  # registered capital (raw)
            u'营业期限': 'openFromTo',  # operating period (raw)
            u'登记机关': 'regOrg',  # registration authority
            # u'发照日期': 'fzrq',  # licence issue date (disabled mapping)
            u'住所': 'address',  # registered address
            u'经营范围': 'operateScope',  # business scope
        }
        result = copy.deepcopy(self.BASE_DATA_FORMAT)
        result['enterpriseName'] = target['companyName']
        for li in soup.find('ul', attrs={'class': 'company-info'}).find_all('li'):
            text = li.getText().strip()
            # Split "label：value" on the first full-width colon only.
            kv = map(lambda x: x.strip(), re.split(ur'：', text, maxsplit=1))
            if kv[0] in KV_MAP:
                result[KV_MAP[kv[0]]] = kv[1]
        # The address cell embeds a "view map" link caption; strip it out.
        result['address'] = result['address'].replace(u'查看地图', '').strip()
        if 'openFromTo' in result and result['openFromTo']:  # e.g. "1999-02-05 - 无固定期限" (no fixed term) or "1998-10-14 - 2018-10-13"
            temp = result['openFromTo'].split(ur' - ')
            if temp:
                if self.PATTERN_YY_MM_DD.match(temp[0].strip()):
                    result['openFrom'] = self.PATTERN_YY_MM_DD.findall(temp[0])[0] + ' 00:00:00'
            if len(temp) > 1:
                if self.PATTERN_YY_MM_DD.match(temp[1].strip()):
                    result['openTo'] = self.PATTERN_YY_MM_DD.findall(temp[1])[0] + ' 00:00:00'
            del result['openFromTo']
        try:
            if result['regcap']:
                # Split e.g. "1,000万元人民币" into numeric amount + currency text.
                result['regCap'] = re.findall(ur'([\d|\.]+)', result['regcap'].replace(',', ''))[0]
                result['regCapCur'] = result['regcap'].replace(',', '').replace(result['regCap'], '').strip()
            del result['regcap']
        except Exception, e:
            pass
        if result['esDate']:
            # Normalize the bare date to a full timestamp.
            result['esDate'] += ' 00:00:00'
        return result

    def parse_investors_info(self, soup):
        """Parse shareholder blocks into INVESTOR_DATA_FORMAT dicts.

        Reads each 'detail-partner-list' div: subscribed amount, paid-in
        date (normalized to 'YYYY-MM-DD 00:00:00') and shareholder name.
        """
        result = []
        for div in soup.find_all('div', attrs={'class': 'detail-partner-list'}):
            item = copy.deepcopy(self.INVESTOR_DATA_FORMAT)
            for p in div.find_all('p'):
                p_text = p.getText().strip()
                if u'认缴出资额' in p_text:  # subscribed contribution amount
                    item['subConam'] = re.findall(u'\d+[\.]?\d*', p_text)[0].strip()
                elif u'实缴出资额' in p_text:  # paid-in amount — deliberately ignored
                    pass
                    # item['subConam'] = re.findall(u'\d+', p_text)[0]
                elif u'实缴时间' in p_text:  # paid-in date
                    date = map(int, re.findall(ur'(\d+).(\d+).(\d+)', p_text)[0])
                    item['conDate'] = '%4d-%02d-%02d 00:00:00' % (date[0], date[1], date[2])
                elif p.find('span'):
                    # The shareholder-name paragraph is the one with a <span>.
                    item['shareholderName'] = p.find('span').getText().strip()
            result.append(item)
        return result

    def parse_members_info(self, soup):
        """Parse key personnel (name / position) from the people list."""
        result = []
        ul = soup.find('ul', attrs={'class': 'people-list'})
        if not ul:
            return result
        for li in ul.find_all('li'):
            parts = li.getText().strip().split('\n')
            if not parts:
                continue
            entry = {
                'name': parts[0].strip(),
                'position': parts[1].strip() if len(parts) > 1 else '',
                'sex': '',
            }
            result.append(entry)
        return result

    def parse_entinv(self, url):
        """Fetch the outbound-investment page and parse each investee.

        :param url: company detail URL + '_touzi'
        :return: list of dicts; only entName/regNo/name/regOrg are actually
            populated — the remaining keys are schema placeholders.
        """
        result = []
        content = self.get_url_content(url)
        soup = BeautifulSoup(content)
        ul = soup.find('ul', attrs={"class": 'site-list-group infolist'})
        if not ul:
            return result
        for li in ul.find_all('li', attrs={'class': 'site-list-item company-list-item'}):
            item = {
                'entName': li.find('h3', attrs={'class': 'site-list-title'}).find('a').getText().strip(),  # investee (institution) name
                'regNo': '',  # registration number
                'entType': '', 'regCap': '', 'regCapcur': '', 'canDate': '',
                'revDate': '', 'entStatus': '', 'regOrg': '', 'subConam': '',
                'currency': '', 'fundedRatio': '', 'esDate': '', 'name': '',
            }
            for div in li.find_all('div', attrs={'class': 'site-text'}):
                line = div.getText().strip().replace(' ', '')
                if u'注册号' in line:
                    item['regNo'] = re.findall('\d+', line)[0]
                if u'法人' in line:
                    # Skip past the "法人" label + colon (offset 3).
                    item['name'] = line[line.find(u'法人') + 3:]
                if u'登记机关' in line:
                    # Skip past the "登记机关" label + colon (offset 5).
                    item['regOrg'] = line[line.find(u'登记机关') + 5:]
            result.append(item)
        return result

    def select_db(self, reg_nomber):
        """Look up a previously cached crawl result by registration number."""
        cache = utils.DB['cacheQichacha']
        return cache.find_one({'_id': reg_nomber})

    def parse_alter_info(self, soup):
        """Parse the change-history table into alteration records."""
        result = []
        record_div = soup.find('div', attrs={"class": 'company-record'})
        if not record_div:
            return result
        for tr in record_div.find_all('tr'):
            try:
                cells = [td.getText().strip() for td in tr.find_all('td')]
                result.append({
                    'altDate': '%s 00:00:00' % cells[3],  # change date
                    'altItem': cells[0],  # changed item
                    'altBe': cells[1],  # content before change
                    'altAf': cells[2],  # content after change
                })
            except:
                # Header rows / malformed rows lack 4 cells — skip them.
                pass
        return result

    def deal_time(self, string):
        try:
            return '%s 00:00:00' % '-'.join(re.findall(ur'(\d{4}).(\d{2}).(\d{2})', string)[0])
        except:
            return ''

    def parse_inv_info(self, table):
        """Parse the 'promoters & contribution' table of an annual report."""
        result = []
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < 7:
                continue  # header or malformed row
            texts = [cell.getText().strip() for cell in cells]
            result.append({
                'shareholderName': texts[0],  # promoter
                'subConam': texts[1],  # subscribed amount (10k CNY)
                'subConDate': texts[2],  # subscribed date
                'subConType': texts[3],  # subscribed method
                'paidConMoney': texts[4],  # paid-in amount (10k CNY)
                'paidTime': self.deal_time(texts[5]),  # paid-in date
                'paidType': texts[6],  # paid-in method
            })
        return result

    def parse_assets_info(self, table):
        """Parse the company-assets table of an annual report.

        Labels come from <th> cells, values from the <td> at the same
        position; unknown labels are ignored.
        """
        kv_map = self.ASSETS_KV_MAP
        # Start with every known field empty (same key set as the map values).
        result = {field: '' for field in kv_map.values()}
        for th, td in zip(table.find_all('th'), table.find_all('td')):
            label = th.getText().strip()
            value = td.getText().strip()
            if label in kv_map:
                result[kv_map[label]] = value
        return result

    def parse_equity_info(self, table):
        """Parse the equity-change table of an annual report."""
        rows = []
        for tr in table.find_all('tr'):
            cells = [td.getText().strip() for td in tr.find_all('td')]
            if len(cells) < 4:
                continue  # header or malformed row
            rows.append({
                'shareholderName': cells[0],
                'equityAfter': cells[1],
                'equityBefore': cells[2],
                'time': self.deal_time(cells[3]),
            })
        return rows

    def parse_change_info(self, table):
        """Parse the modification-record table of an annual report."""
        rows = []
        for tr in table.find_all('tr'):
            cells = tr.find_all('td')
            if len(cells) < 5:
                continue  # header or malformed row
            # The first cell is a row index; drop it before reading fields.
            texts = [cell.getText().strip() for cell in cells[1:]]
            rows.append({
                'changedItem': texts[0],
                'beforeChange': texts[1],
                'afterChange': texts[2],
                'time': self.deal_time(texts[3]),
            })
        return rows

    def parse_yearreport_item(self, item):
        """Dispatch one (section-title div, table) pair of an annual report.

        :param item: tuple (div, table) as zipped in parse_one_year_report
        :return: (result_key, parsed_value), or None for unknown sections
        """
        div, table = item
        div = div.getText().strip()
        if u'网站或网店信息' in div:  # website / online-shop info
            tds = table.find_all('td')
            return 'website', {
                'type': tds[0].getText().strip(),
                'name': tds[1].getText().strip(),
                'link': tds[2].find('a')['href']
            }
        if u'发起人及出资信息' in div:  # promoters & contributions
            return 'investorInformations', self.parse_inv_info(table)
        if u'企业资产状况信息' in div:  # company assets status
            return 'assetsInfo', self.parse_assets_info(table)
        if u'股权变更信息' in div:  # equity changes
            return 'equityChangeInformations', self.parse_equity_info(table)
        if u'修改记录' in div:  # modification records
            return 'changeRecords', self.parse_change_info(table)
        return None

    def parse_one_year_report(self, div):
        """Parse a single annual-report panel into YEAR_REPORT_FORMAT.

        :param div: one 'panel panel-default' div from the report page
        :return: YEAR_REPORT_FORMAT dict, or {} when the panel has no body
        """
        div = div.find('div', attrs={'class': 'panel-body'})
        if not div:
            return {}
        result = copy.deepcopy(self.YEAR_REPORT_FORMAT)
        KV_MAP = self.YEAR_REPORT_KV_MAP
        for li in div.find('div', attrs={'class': 'detail-info'}).find_all('li'):
            # NOTE(review): this unpack raises ValueError if the line does
            # not contain exactly one '：'; the caller's try/except then
            # discards the whole report. split(u'：', 1) would be more
            # forgiving — confirm before changing.
            k1, k2 = [x.strip() for x in li.getText().split(u'：')]
            if k1 in KV_MAP:
                result['baseInfo'][KV_MAP[k1]] = k2
        # Section-title divs and their tables appear in matching document
        # order, so zipping pairs each title with its table.
        divs = div.find_all('div', attrs={'style': 'margin-top:30px;margin-bottom:20px;'})
        tabs = div.find_all('table', attrs={'class': 'table table-bordered'})
        for item in zip(divs, tabs):
            try:
                key, value = self.parse_yearreport_item(item)
                if key:
                    result[key] = value
            except:
                # Unknown section (None) or malformed table — skip it.
                pass
        return result

    def parse_year_report(self, url):
        """Fetch the report page and parse every annual-report panel."""
        reports = []
        content = self.get_url_content(url)
        soup = BeautifulSoup(content)
        panels = soup.find_all('div', attrs={"class": 'panel panel-default'})
        if not panels:
            return reports
        for panel in panels:
            try:
                parsed = self.parse_one_year_report(panel)
                if parsed:
                    reports.append(parsed)
            except:
                pass
        return reports

    @classmethod
    def get_dict(cls, d, key, default=''):
        if key in d:
            return d[key]
        return default

    def parse_susong_item(self, item):
        """Parse one lawsuit modal into a shixin or executed-person record.

        :param item: a 'modal fade' div with an <h4> category header and a
            table of label/value rows
        :return: a dict for category '失信人' (dishonest debtor) or
            '被执行人' (person subject to enforcement); None for other
            categories or when the modal has no table rows
        """
        trs = item.find_all('tr')
        if not trs:
            return None
        header = item.find('h4')
        result = {}
        for tr in trs:
            try:
                kv = [td.getText().strip() for td in tr.find_all('td')]
                if kv[0] in self.SS_KV_MAP:
                    result[self.SS_KV_MAP[kv[0]]] = kv[1]
            except:
                # Rows without two cells are ignored.
                pass
        # Normalize bare dates to full timestamps.
        if 'regDate' in result and result['regDate']:
            result['regDate'] += ' 00:00:00'
        if 'publishDate' in result and result['publishDate']:
            result['publishDate'] += ' 00:00:00'
        # The <h4> reads "<category>详情" ("... details"); keep the category.
        result['category'] = header.getText().replace(u'详情', '').strip()
        if result['category'] == u'失信人':
            # Dishonest-debtor record (shixinList shape).
            return {
                'category': self.get_dict(result, 'category'),
                'frName': self.get_dict(result, 'frName'),
                'companyName': self.get_dict(result, 'companyName'),
                'idNumber': self.get_dict(result, 'idNumber'),
                'gistId': self.get_dict(result, 'gistId'),
                'caseCode': self.get_dict(result, 'caseCode'),
                'duty': self.get_dict(result, 'duty'),
                'performance': self.get_dict(result, 'performance'),
                'courtName': self.get_dict(result, 'courtName'),
                'gistUnit': self.get_dict(result, 'gistUnit'),
                'areaName': self.get_dict(result, 'areaName'),
                'regDate': self.get_dict(result, 'regDate'),
                'publishDate': self.get_dict(result, 'publishDate'),
            }
        elif result['category'] == u'被执行人':
            # Person-subject-to-enforcement record (executedList shape).
            return {
                'category': self.get_dict(result, 'category'),
                'realName': self.get_dict(result, 'realName'),
                'courtName': self.get_dict(result, 'courtName'),
                'idNumber': self.get_dict(result, 'idNumber'),
                'regDate': self.get_dict(result, 'regDate'),
                'caseCode': self.get_dict(result, 'caseCode'),
                'subject': self.get_dict(result, 'subject'),
            }
        return None

    def parse_susong(self, url):
        """Fetch the lawsuit page and split entries into two lists.

        :param url: company detail URL + '_susong'
        :return: (shixin_list, executed_list) — dishonest-debtor records
            and person-subject-to-enforcement records
        """
        shixin_result = []
        executed_result = []
        content = self.get_url_content(url)
        soup = BeautifulSoup(content)
        div = soup.find('div', attrs={'class': 'shixin clearfix'}).find_all('div', attrs={"class": 'modal fade'})
        if not div:
            return shixin_result, executed_result
        for item in div:
            try:
                temp = self.parse_susong_item(item)
                if temp is None:
                    # Unrecognized category or empty modal: skip quietly
                    # instead of raising TypeError on temp['category'].
                    continue
                if temp['category'] == u'被执行人':
                    executed_result.append(temp)
                else:
                    shixin_result.append(temp)
            except Exception as e:
                logger.exception(e)
        return shixin_result, executed_result

    def parse_mark_item(self, a):
        """Fetch one trademark's JSON detail and map it into MARK_FORMAT.

        :param a: the <a class="shangbiaoView"> anchor carrying the 'sbid'
        :return: MARK_FORMAT dict
        """
        print a['sbid']
        # NOTE(review): post() returns None after 3 failed attempts, which
        # would make json.loads raise here; the caller catches and logs it.
        obj = json.loads(self.post('http://qichacha.com/company_shangbiaoView', data={'id': a['sbid']}))['Result']

        result = copy.deepcopy(self.MARK_FORMAT)

        # Copy the directly-mapped fields via MARK_KV_MAP.
        for item in obj:
            if item in self.MARK_KV_MAP:
                result[self.MARK_KV_MAP[item]] = obj[item]
        try:
            # 'ValidPeriod' is "<from>-<to>"; split and normalize both ends.
            tms = obj['ValidPeriod'].split('-')
            result['validPeriodFrom'], result['validPeriodTo'] = parse_time(tms[0]), parse_time(tms[1])
        except Exception, e:
            logger.exception(e)

        # Normalize each flow step's date; leave it untouched on failure.
        for item in result['flowItems']:
            try:
                item['FlowDate'] = parse_time(item['FlowDate'])
            except:
                pass
        try:
            result['applyDate'] = parse_time(result['applyDate'])
        except:
            pass

        # Rewrite absolute image URLs to the local /tmimg proxy path.
        if result['imageUrl']:
            result['imageUrl'] = self.PATTERN_QICHACHA_HOST.sub('/tmimg', result['imageUrl'])

        return result

    def parse_mark(self, url):
        """Fetch the trademark tab and parse each trademark entry."""
        marks = []
        content = self.get_url_content(url)
        soup = BeautifulSoup(content)
        container = soup.find('div', attrs={'class': 'tab-content'})
        if not container:
            return marks
        for anchor in container.find_all('a', attrs={'class': 'shangbiaoView'}):
            try:
                parsed = self.parse_mark_item(anchor)
                if parsed:
                    marks.append(parsed)
            except Exception as e:
                logger.exception(e)
        return marks

    def parse_grade(self, string):
        try:
            point = int(string.strip()[-3]) >> 1
            return (5, point)[1 <= point <= 4]
        except:
            return 0

    def parse_comment_item(self, li):
        """Parse one user review: author, link, time, star grade and text."""
        header = li.find('h3')
        author = header.find('a')
        star = header.find('i', attrs={'class': 'star-on'})
        body = li.find('div', attrs={'class': 'site-text detail-review'})
        return {
            'userName': author.getText().strip(),
            'userUrl': 'http://qichacha.com' + author['href'],
            'time': parse_time(header.find('span').getText()),
            'grade': self.parse_grade(star['style']),
            'content': body.getText().strip(),
        }

    def parse_comment(self, url):
        """Fetch the comments page and parse every review entry."""
        comments = []
        content = self.get_url_content(url)
        soup = BeautifulSoup(content)
        for li in soup.find_all('li', attrs={'class': 'site-list-item'}):
            try:
                comments.append(self.parse_comment_item(li))
            except Exception as e:
                logger.exception(e)
        return comments

    def parse_baike_websites(self, url):
        """Parse the baike 'website' tab: e-mails, phone numbers, websites.

        :param url: baike URL with '?tab=website'
        :return: (emails, phones, websites) tuple — website entries carry
            link/licenseId/websiteName/regTime; returns None when the page
            yields no contact data at all.
        """

        emails = []
        phones = []
        websites = []
        website_temp = []

        content = self.get_url_content(url)
        soup = BeautifulSoup(content)

        contact_info = soup.find('p', attrs={'class': 'contact-info'})
        if contact_info:
            # Anchors are the e-mail links; whatever text remains after
            # removing them is treated as whitespace-separated phone numbers.
            emails = [a.getText().strip() for a in contact_info.find_all('a')]
            data = contact_info.getText().strip()
            data = reduce(lambda string, email: string.replace(email, ''), emails, data).strip()
            phones = [x for x in re.split(ur'\s', data) if x.strip()]

        contact_link = soup.find('p', attrs={'class': 'contact-link'})
        if contact_link:
            # Fallback website list (links only, no ICP licence details).
            website_temp = [{'link': a['href'], 'licenseId': '', 'websiteName': '', 'regTime': ''}
                            for a in contact_link.find_all('a')]

        table = soup.find('table', attrs={'class': 'table corde-table'})
        if not table:
            if (not emails) and (not phones) and (not website_temp):
                return None
            return emails, phones, website_temp

        # The detail table supersedes the fallback link list.
        for tr in table.find_all('tr'):
            try:
                tds = tr.find_all('td', attrs={'class': 'td-content'})
                if not tds:
                    continue
                websites.append({
                    'link': tds[1].getText().strip(),
                    'licenseId': tds[2].getText().strip(),
                    'websiteName': tds[3].getText().strip(),
                    'regTime': parse_time(tds[4].getText().strip())
                })
            except:
                pass
        return emails, phones, websites

    def parse_baike(self, url):
        """Fetch the 'baike' tab and fill BAIKE_FORMAT's contact fields.

        Only the website sub-tab (emails/phones/websites) is parsed; the
        remaining BAIKE_FORMAT keys stay empty.

        :return: BAIKE_FORMAT dict on success, or [] on any failure.
            NOTE(review): the failure value [] differs in type from the
            success value (dict) — confirm downstream consumers handle both.
        """
        try:
            result = copy.deepcopy(self.BAIKE_FORMAT)
            # parse_baike_websites may return None, which fails the tuple
            # unpack below and is treated as an overall failure.
            result['emails'], result['phones'], result['websites'] = self.parse_baike_websites(url + '?tab=website')
            return result
        except Exception, e:
            logger.exception(e)
            return []

    def crawl_one(self, target, forced_latest=False):
        """Crawl one company's detail page and sub-pages into DATA_FORMAT.

        :param target: search hit carrying at least 'KeyNo' and 'companyName'
        :param forced_latest: kept for interface compatibility; the cache
            lookup it used to control is commented out below
        :return: DATA_FORMAT dict keyed (_id) by the company's regNo
        """
        # if not forced_latest:
        #     data = self.select_db(target['regNumber'])
        #     if data:
        #         return data
        result = copy.deepcopy(self.DATA_FORMAT)
        link = 'http://qichacha.com/firm_CN_' + target['KeyNo']

        content = self.get_url_content(link)
        soup = BeautifulSoup(content)

        # 1. registration "licence face" info (basicList)
        result['basicList'] = [self.parse_base_info(soup, target)]
        # 2. shareholders and contributions (shareHolderList)
        result['shareHolderList'] = self.parse_investors_info(soup)
        # 3. key management personnel (personList)
        result['personList'] = self.parse_members_info(soup)
        # 10. historical change records (alterList)
        result['alterList'] = self.parse_alter_info(soup)
        # Walk the nav tabs; only visit sub-pages whose counter is non-zero.
        for a in soup.find('nav', attrs={'id': 'company-nav'}).find_all('a'):
            try:
                title = a.getText()
                count = a.find('em').getText().strip()
                if u'0' == count:
                    continue
                if u'对外投资' in title:  # outbound investment
                    # 7. outbound investment info
                    result['entinvItemList'] = self.parse_entinv(link + '_touzi')
                elif u'诉讼' in title:  # lawsuits
                    result['shixinList'], result['executedList'] = self.parse_susong(link + '_susong')
                elif u'年报' in title:  # annual reports
                    # 18. annual reports (yearReport)
                    result['yearReportList'] = self.parse_year_report(link + '_report')
                elif u'商标' in title:  # trademarks
                    result['trademarkList'] = self.parse_mark(link + '_shangbiao')
                elif u'吐槽' in title:  # user comments
                    result['commentList'] = self.parse_comment(link + '_comments')
                else:
                    print '[%s][%s]' % (title, count)
            except Exception, e:
                logger.exception(e)
            print '- ' * 100
        result['baikeList'] = self.parse_baike(link + '_baike')
        # Not collected yet (placeholders in DATA_FORMAT):
        # 4. dishonest-debtor info (punishBreakList)
        # 5. person-subject-to-enforcement info (punishedList)
        # 6. Alibaba debt info (alidebtList)
        # 8. legal representative's outbound investments (frinvList)
        # 9. legal representative's positions elsewhere (frPositionList)
        # 11. branch info (filiationList)
        # 12. administrative penalties (caseInfoList)
        # 13. equity freeze history (sharesFrostList)
        # 14. equity pledge history (sharesImpawnList)
        # 15. chattel pledge info (morDetailList)
        # 16. chattel mortgage collateral info (morguaInfoList)
        # 17. liquidation info (liquidationList)
        # print json.dumps(target, ensure_ascii=False, indent=4)

        result['_id'] = result['basicList'][0]['regNo']
        result['companyName'] = result['basicList'][0]['enterpriseName']
        result['province'] = self.province

        return result

    def save(self, save_data):
        """Persist one crawled company record.

        Not implemented yet: always returns 0 ("nothing saved") so callers
        that count successful saves are unaffected.

        TODO: actually write ``save_data`` to storage.
        """
        return 0

    # old
    def run_old(self, mode=0, forced_latest=False):
        """Legacy driver: search for ``self.key``, crawl and save results.

        Bug fix: in MODE_SINGLE the original crawled the target TWICE --
        once (ignoring ``forced_latest``) for the save, then again for the
        return value.  Each target is now crawled exactly once and the same
        data is both saved and returned.

        :param mode: MODE_SINGLE (first hit), MODE_PAGE (first result page)
            or MODE_ALL (paginate until the soft save cap is reached).
        :param forced_latest: forwarded to ``crawl_one``.
        :returns: the first crawled dict in MODE_SINGLE, otherwise None.
        """
        url = self.build_search_url()
        saved = 0
        while url:
            content = self.get_url_content(url)
            soup = BeautifulSoup(content)
            for target in self.parse_target_list(soup):
                data = self.crawl_one(target, forced_latest)
                if self.save(data):
                    saved += 1
                if mode == self.MODE_SINGLE:
                    return data
            if mode in (self.MODE_SINGLE, self.MODE_PAGE):
                break
            if saved > 10:  # soft cap on saves per run
                break
            url = self.find_next_page(soup)

    def check_url(self, url):
        """Fetch *url* and detect a JavaScript-based redirect.

        qichacha sometimes answers a search with a tiny page whose only
        content is ``<script>location.href=...;</script>``.

        :returns: the redirect target URL, or None when the body contains
            no such hop.
        """
        body = self.request.get(url).content
        # Non-raw equivalent of the original ur"..." pattern (byte-identical).
        pattern = u"<script>location.href=[\\\"'](http://www.qichacha.com.+?)[\\\"'];</script>"
        matches = re.findall(pattern, body)
        return matches[0] if matches else None

    # Online real-time crawl API (legacy)
    def run_old2(self, mode=0, forced_latest=False, strictMode=False):
        """Search for ``self.key`` and crawl the first (or exact) match.

        NOTE(review): both calls below hand ``crawl_one`` a URL string
        (``checked`` / ``target['link']``), but the current ``crawl_one``
        indexes its argument with ``target['KeyNo']`` -- this path looks
        incompatible with it; confirm which crawl_one revision this was
        written against before reviving this method.

        :param mode: accepted for signature parity; not used here.
        :param strictMode: when True, only crawl a result whose company
            name equals ``self.key`` exactly.
        :returns: the crawled dict, or None when nothing matches.
        """
        url = self.build_search_url()
        checked = self.check_url(url)
        if checked:
            # Search redirected straight to a detail page.
            return self.crawl_one(checked, forced_latest)
        content = self.get_url_content(url)
        soup = BeautifulSoup(content)
        target_list = self.parse_target_list(soup)
        for target in target_list:
            # if u'分公司' in target['company']:
            #     continue
            if strictMode:
                if target['company'] != self.key:
                    continue
            return self.crawl_one(target['link'], forced_latest)
        return None

    # new
    def run(self, mode=0, forced_latest=False, strictMode=False):
        """Crawl the best candidate for ``self.key`` via the name-check API.

        ``mode`` and ``strictMode`` are kept for signature compatibility
        with ``run_old``/``run_old2`` but are currently not honoured here
        (TODO: apply strictMode filtering to the candidate list).

        Bug fix: ``forced_latest`` used to be silently dropped; it is now
        forwarded to ``crawl_one`` (default False, so existing callers see
        no change).

        :returns: the crawled company dict, or None when no candidate
            exists.
        """
        targets = self.check_name()
        if not targets:
            return None
        return self.crawl_one(targets[0], forced_latest)

    @classmethod
    def md5(cls, string):
        """Hex MD5 digest of the fixed site salt prepended to *string*.

        Unicode input is UTF-8 encoded first so the hash always runs over
        bytes.  The salt is the token qichacha's AJAX endpoints expect.
        """
        if isinstance(string, unicode):
            string = string.encode('utf-8')
        salted = 'f625a5b661058ba5082ca508f99ffe1b' + string
        return hashlib.md5(salted).hexdigest()

    def check_name(self):
        """Resolve ``self.key`` via qichacha's AJAX company-list endpoint.

        :returns: a list of candidate dicts (the server's ``Name`` field is
            renamed to ``companyName``), or None when the request or parse
            fails.

        Fix: the old bare ``except:`` silently swallowed *every* error,
        including KeyboardInterrupt/SystemExit; it is now narrowed to
        ``Exception`` and logged, while keeping the best-effort None return.
        """
        postdata = {'key': self.key, 'token': self.md5(self.key)}
        try:
            jsonstr = self.post('http://qichacha.com/gongsi_getList', data=postdata, headers={
                'Referer': 'http://qichacha.com/',
                'X-Requested-With': 'XMLHttpRequest',
            })
            # Normalise the endpoint's 'Name' key to the 'companyName' key
            # used throughout DATA_FORMAT.  NOTE(review): a plain text
            # replace would also rename any other '*Name' substring in the
            # payload -- TODO confirm the response carries none.
            return json.loads(jsonstr.replace('Name', 'companyName'))
        except Exception as e:
            logger.exception(e)
            return None


def main():
    """Demo entry point: crawl a single company by name and dump it as JSON."""
    crawler = QichachaCrawler(u'企业', QichachaCrawler.TYPE_COMPANY)
    report = crawler.run(mode=QichachaCrawler.MODE_SINGLE)
    print(json.dumps(report, ensure_ascii=False, indent=4))


if __name__ == '__main__':
    # Fix: ``main`` already prints its report and returns None, so the old
    # ``print main()`` emitted a stray "None" line after the JSON output.
    main()
